library(Signac)
library(Seurat)
library(ggplot2)
library(ggpubr)
library(tidyverse)
library(plyr)
library(reshape2)
library(data.table)
library(GenomicRanges)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(hdf5r)
library(stringr)
library(ggpubr)
library(RColorBrewer)
library(magick)
library(knitr)
library(kableExtra)
library(scater)
set.seed(123)
Here we set up the parameters of ATAC and RNA seq analysis.
# Thresholds
# ATAC-seq
TSS_enrichment <- 2
nucleosome_signal_atac <- 2
min_lib_size_atac<- 300
max_lib_size_atac <- 200000
#RNA-seq
min_lib_size_rna<-470
max_lib_size_rna<-40000
min_ngenes_rna<-250
max_ngenes_rna<-7000
max_percent_mit <-20
PATH
path_to_save <- "results/R_objects/"
Here we will set the functions that will be used in the quality control (QC) process
#function that creates list of the sample path of the file h5
make.sample.path <- function(path){
sample.path=list()
sample.path <-list.files(path=path,
pattern="h5", recursive = T, full.names = TRUE)
return(sample.path)
}
make.fragpath <- function(path){
sample.path<-list()
sample.path <-list.files(path=path,
pattern=".tsv.gz$", recursive = T, full.names = TRUE)
return(sample.path)
}
In this function called make.SeuratObject.list we will create a list of multiome of seurat object with chromatin assay data.
It has the 2 previous function nested.
make.SeuratObject.list <- function(path){
sample.name <-list()
sample.pathList <-list()
tonsil_list <-list()
fragpathList<-list()
# get gene annotations for hg38
sample.pathList<- make.sample.path(path)
fragpathList<- make.fragpath(path)
sample.name<- sub(".*Experiment/.*\\/(.*)\\/filtered_feature_bc_matrix.h5","\\1", sample.pathList, perl = TRUE )
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
seqlevelsStyle(annotation) <- "UCSC"
genome(annotation) <- "hg38"
for (i in seq_along(sample.pathList)){
counts = Read10X_h5(filename=sample.pathList[[i]])
fragpath <-fragpathList[[i]]
tonsil<-CreateSeuratObject(counts = counts$`Gene Expression`, assay = "RNA")
tonsil[["ATAC"]] <- CreateChromatinAssay(
counts = counts$Peaks,
sep = c(":", "-"),
genome="hg38",
fragments = fragpath,
annotation = annotation
)
tonsil_list[[sample.name[i]]] = assign(sample.name[[i]], tonsil, envir=.GlobalEnv)
}
return(tonsil_list)
}
Once we set all the quality control metric we will use this function to filter out all the cell that are not within the thresholds.
filtering.cell <- function(seurat_object.list){
seurat_object <- subset(x = seurat_object.list,
subset = nCount_ATAC < max_lib_size_atac &
nCount_RNA < max_lib_size_rna &
nCount_ATAC > min_lib_size_atac &
nCount_RNA > min_lib_size_rna &
nFeature_RNA > min_ngenes_rna &
TSS.enrichment > TSS_enrichment &
nucleosome_signal < nucleosome_signal_atac &
percent.mt < max_percent_mit
)
return(seurat_object)
}
tecnical.cell <- function(seurat_object.list){
seurat_object <- subset(x = seurat_object.list,
nFeature_RNA < min_ngenes_rna
)
return(seurat_object)
}
Function to save the filtered out cells. (it is not working)
filter.out.cell <- function(seurat_object.list){
seurat_object <- subset(x = seurat_object.list,
!(subset = nCount_ATAC < max_lib_size_atac &
nCount_RNA < max_lib_size_rna &
nCount_ATAC > min_lib_size_atac &
nCount_RNA > min_lib_size_rna &
nFeature_RNA > min_ngenes_rna &
TSS.enrichment > TSS_enrichment &
percent.mt < max_percent_mit &
nucleosome_signal < nucleosome_signal_atac)
)
return(seurat_object)
}
This function will carry out the Nucleosome signal using NucleosomeSignalfunction, the TSS enrichement using
TSSEnrichmentfunction and also it will calculate the fraction of mitocondria and ribosomal RNA
in each sample.
- Michochondrial genes are useful indicators of cell state.
- We can define ribosomal proteins (their names begin with RPS or RPL), which often
take substantial fraction of reads:
QC <- function(SeuratObject.list){
DefaultAssay(SeuratObject.list) <- "ATAC"
SeuratObject.list <- NucleosomeSignal(SeuratObject.list)
SeuratObject.list <- TSSEnrichment(SeuratObject.list, fast = FALSE)
SeuratObject.list$tss.level <- ifelse(SeuratObject.list$TSS.enrichment > 2, "High", "Low")
DefaultAssay(SeuratObject.list)<-"RNA"
SeuratObject.list[["percent.mt"]] <- PercentageFeatureSet(SeuratObject.list, pattern = "^MT-")
SeuratObject.list[["percent_ribo"]] <- PercentageFeatureSet(SeuratObject.list, pattern = "^RP[SL]")
return(SeuratObject.list)
}
After the QC process, we can use this function to make a data frame with only the metadata of library.
md_df<- data.frame()
make.metadata.df<- function (all_data){
md_df = rbind(md_df, all_data@meta.data)
return(md_df)
}
tss.enrich.plot <- function(seurat_object_df) {
for (i in seq_along(seurat_object_df)){
DefaultAssay(seurat_object_df[[i]]) <- "ATAC"
fh <- FragmentHistogram(object = seurat_object_df[[i]])
p <- TSSPlot(seurat_object_df[[i]], group.by = 'tss.level') +
ggtitle(paste0("TSS enrichment score of ",unique(names(seurat_object_df[i])))) +
theme_minimal() +
geom_vline(xintercept =c(0,220),linetype = "dashed", colour = "black") +
xlab(bquote('Relative Position (bp form TSS')) +
ylab(expression("Relative enrichement"))
print(p)
}
}
Signac uses information from three related input files (created using CellRanger ARC):
ReadExperiment 1
path_exp1<-"../data/Experiment/1"
SeuratObject.Exp1<-make.SeuratObject.list(path_exp1)
Read Experiment 2
path_exp2<-"../data/Experiment/2"
SeuratObject.Exp2<-make.SeuratObject.list(path_exp2)
list.name is a list of id sample with each library name.
list.name<-list(
pd9avu0k_kf9ft6kk="BCLL_14_T_1",
vuuqir4h_wfkyb5v8 ="BCLL_14_T_2",
admae8w2_89i88tvv="BCLL_15_T_1" ,
sr20954q_yiuuoxng="BCLL_15_T_2" ,
kmbfo1ab_ie02b4ny="BCLL_2_T_1" ,
ryh4el3i_biv0w7ca="BCLL_2_T_2" ,
bs2e7lr7_mdfwypvz="BCLL_2_T_3" ,
co7dzuup_xuczw9vc="BCLL_9_T_1" ,
qmzb59ew_t11l8dzm="BCLL_9_T_2" ,
ulx1v6sz_8a2nvf1c="BCLL_8_T_1" ,
wdp0p728_jf6w68km="BCLL_8_T_2")
list.name<-list.name[order(names(list.name))]
SeuratObject.list<- c(SeuratObject.Exp1,SeuratObject.Exp2)
SeuratObject.list<-SeuratObject.list[order(names(SeuratObject.list))]
#Change sample id by library name.
names(SeuratObject.list)<- c(list.name[[1]],list.name[[2]],list.name[[3]],list.name[[4]],list.name[[5]],list.name[[6]],list.name[[7]],list.name[[8]],list.name[[9]],list.name[[10]],list.name[[11]])
Quality control metrics are collected to determine library complexity, signal to noise ratios, fragment length distribution per replicate (where available), and reproducibility.
Here we will create a list called ’all_data` which has all the quality control of each Seurat object of each samples.
Look at the distributions before deciding on cutoffs.
all_data = lapply(SeuratObject.list, QC)
We will creat a data frame with only the metadata of each sample which will be called
metada_df.
df<-lapply(all_data, make.metadata.df)
##create a data frame of all metadata
metadata_df <- map_df(df, ~as.data.frame(.x), .id="id")
metadata_df<-metadata_df[order(metadata_df$id),]
write.csv(metadata_df, "../results/tables/metadata_df_qc.csv", row.names=TRUE, quote=FALSE)
num_cell<- as.data.frame(ddply(metadata_df, .(id), nrow))
#change column names
colnames(num_cell) <- c("library_name","initial_cells")
One important step is to know of the exactly number of initial cells we have in each sample to know the percentage of cell we are going to keep after the filtering process.
|
Total Number
|
|
|---|---|
| library_name | initial_cells |
| BCLL_14_T_1 | 6919 |
| BCLL_14_T_2 | 6103 |
| BCLL_15_T_1 | 5788 |
| BCLL_15_T_2 | 5845 |
| BCLL_2_T_1 | 10837 |
| BCLL_2_T_2 | 7910 |
| BCLL_2_T_3 | 7460 |
| BCLL_8_T_1 | 7181 |
| BCLL_8_T_2 | 7861 |
| BCLL_9_T_1 | 5572 |
| BCLL_9_T_2 | 5530 |
ggviolin(
metadata_df,
x="id",
y="nFeature_ATAC",
fill="steelblue",
add="boxplot",
title = "Number of detected peaks",
ggtheme = theme_pubr(x.text.angle = 20),
add.params = list(fill = "white"))+
geom_hline(yintercept = 18000, linetype='dashed', col = 'red')+
labs(x = "Library name", y = "Number of detected Peaks")
We can easily see that the sample BCLL_2 which belong to the adult sample have a sifnifcant different of number of peak distribution. This difference could low the general average of number of peaks.
The median number of peaks is: 3331
The mean number of peaks is: 4653.39
The median number of peaks per library
aggregate(nFeature_ATAC ~ id, data = metadata_df, median)
id nFeature_ATAC
1 BCLL_14_T_1 5009.0
2 BCLL_14_T_2 5001.0
3 BCLL_15_T_1 4264.0
4 BCLL_15_T_2 4160.0
5 BCLL_2_T_1 1611.0
6 BCLL_2_T_2 1598.5
7 BCLL_2_T_3 2223.0
8 BCLL_8_T_1 4656.0
9 BCLL_8_T_2 4355.0
10 BCLL_9_T_1 5552.0
11 BCLL_9_T_2 5461.5
In the table above shows clearly that the median of peaks of BCLL2 libraries are significantly different that the rest. Their median are almost 4 time lower than the other ones.
Nucleosome banding pattern: The histogram of DNA fragment sizes (determined from the paired-end sequencing reads) should exhibit a strong nucleosome banding pattern corresponding to the length of DNA wrapped around a single nucleosome. We calculate this per single cell, and quantify the approximate ratio of Di-nucleosomal(DI) and mono-nucleosomal(MONO) to nucleosome-free(NFR) fragments.
Single Cell ATAC read pairs produce detailed information about nucleosome packing and positioning. The fragment length distribution captures the nucleosome positioning periodicity.
Histogram are divided by NFR, MonoNR and DiNR. Dashed red lines represent the thresholds we set according our observation which is base on the lowest parts of the histogram line. Back straight lines are threshold (147 and 294) base on bibliography, length of DNA wrapped around a single nucleosome. Ratio of fragments of each part made by both bibliography and our own threshold are represented in back and in red respectively.
The plot can be used to evaluate the quality of transposase reaction. We expect to find half or more of the of fragments within the nucleosome free regions (NFR) to confirm that our data is high quality and the transposase worked properly.
Biobliography information about the minimum and maximum thresolds. Nucleosome signal. The length of DNA wrapped around a single nucleosome has been experimentally determined as 147 bp. As Tn5 has a strong preference to integrate into nucleosome-free DNA, successful ATAC-seq experiments typically exhibit a depletion of DNA fragments with lengths that are multiples of 147 bp. We defined the nucleosome signal QC metric in Signac as the ratio of mononucleosomal (147–294 bp) to nucleosome-free (<147 bp) fragments sequenced for the cell, as a way of quantifying the expected depletion of nucleosome-length DNA fragments.
nucleosome.bp <- function(seurat_object_df) {
for (i in seq_along(seurat_object_df)){
DefaultAssay(seurat_object_df[[i]]) <- "ATAC"
fh <- FragmentHistogram(object = seurat_object_df[[i]])
min.threshold<-147
max.threshold<-294
my.min.threshold <- 127
my.max.threshold <- 274
NFR<- (length(which(fh$data$length < min.threshold))/ nrow(fh$data)) *100
MonoNR<- (length(which(fh$data$length > min.threshold &fh$data$length < max.threshold ))/ nrow(fh$data)) *100
DiNR<- (length(which(fh$data$length > max.threshold))/ nrow(fh$data)) *100
my.NFR<- (length(which(fh$data$length < my.min.threshold))/ nrow(fh$data)) *100
my.MonoNR<- (length(which(fh$data$length > my.min.threshold & fh$data$length < my.max.threshold ))/ nrow(fh$data)) *100
my.DiNR<- (length(which(fh$data$length > my.max.threshold))/ nrow(fh$data)) *100
p <- ggplot(fh$data, aes(length)) +
ggtitle(paste0("Nucleosome banding pattern of ",unique(names(seurat_object_df[i])))) +
geom_histogram(binwidth = 1, fill = "steelblue") +
geom_density(aes(y = ..count..), bw = 1, alpha = 0, col = "black", lwd = 1) + scale_x_continuous(limits = c(0, 550)) +
geom_vline(xintercept = c(my.min.threshold, my.max.threshold),linetype="dashed",colour = "red") +
geom_vline(xintercept = c(min.threshold, max.threshold)) +
theme_minimal()+
geom_text(x = 80, y = 50, label = paste("NFR",round(NFR, 2)), size = 3) +
geom_text(x = 200, y = 50, label = paste("MONO",round(MonoNR, 2)), size = 3) +
geom_text(x = 350, y = 50, label = paste("DI",round(DiNR, 2)), size = 3)+
geom_text(x = 80, y = 100, label = round(my.NFR, 2), size = 3, color="red") +
geom_text(x = 200, y = 100, label = round(my.MonoNR, 2), size = 3, color="red") +
geom_text(x = 350, y = 100, label = round(my.DiNR, 2), size = 3, color="red")
print(p)
}
}
t<-nucleosome.bp(all_data)
Insert size distributions of the aggregated single cells from all eleven samples exhibited clear nucleosoma banding patterns.
ns <- ggviolin(metadata_df,
x = "id", fill = "steelblue", x.text.angle = 45,
y = "nucleosome_signal",
title = "Nucleosome signal distribution(log10)",
) + scale_y_log10() + geom_hline(yintercept = nucleosome_signal_atac, linetype='dashed', col = 'red')
ns + labs(x = "Library name", y = "Nucleosome signal")
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 8 rows containing non-finite values (stat_ydensity).
## TSS enrichment
Transcriptional start site (TSS) enrichment score. The ENCODE project has defined an ATAC-seq targeting score based on the ratio of fragments centered at the TSS to fragments in TSS-flanking regions. Poor ATAC-seq experiments typically will have a low TSS enrichment score.
TSS scores = the depth of TSS (each 100bp window within 1000 bp each side) / the depth of end flanks (100bp each end).
TSSE score = max(mean(TSS score in each window))
To plot TSS enrichment profiles, we use the TSSPlot() function. TSS enrichment profiles show a clear peak in the center and a smaller shoulder peak at the downstream of the TSS (TSS + 220) could be the spacing region between two flanking nucleosomes.
Two vertical back dashed lines are given at the consensus TSS (hg38 genome) and at TSS + 220 bp.
We can see clearly that the reads concentrate around the TSS, with a prominent peak a bit upstream
tss<-tss.enrich.plot(all_data)
Now, we are going to show the distribution of the TSS scores for each samples. Red dashes line represents the cut off used to slip the data by low and high TSS score.
ls<- ggviolin(metadata_df,
x = "id", fill = "steelblue", x.text.angle = 25,y = "TSS.enrichment") +
geom_hline(yintercept = 2, linetype='dashed', col = 'red')+
labs(x = "Library name", y = "TSS enrichment score")
ls1<- ls + scale_y_log10()+ ggtitle("TSS enrichment score (log10)")
ls + ggtitle("TSS enrichment score")
ls1
The library size of ATAC-seq make reference to the number of reads
ls1<- ggboxplot(metadata_df,
x = "id", fill = "steelblue", x.text.angle = 25,
y = "nCount_ATAC", title = "Library size (log10)") +
scale_y_log10() +
geom_hline(yintercept = c(min_lib_size_atac,max_lib_size_atac), linetype='dashed', col = 'red') +
labs(x = "Library name", y = "Number of counts ATAC-seq")
ls1
ls<- ggviolin(metadata_df,
x = "id", fill = "steelblue", x.text.angle = 25,
y = "nCount_ATAC", title = "Library size (log10)" ,add="boxplot", add.params = list(fill = "white")) +
scale_y_log10() +
geom_hline(yintercept = c(min_lib_size_atac,max_lib_size_atac), linetype='dashed', col = 'red')+
labs(x = "Library name", y = "Number of counts ATAC-seq")
ls
lib_size_hist_log <- metadata_df %>%
ggplot(aes_string("nCount_ATAC")) +
geom_histogram(bins = 100) +
labs(x = "Library Size (log10)", y = "Number of Cells")+
theme_pubr()+
scale_x_log10() +
geom_vline(xintercept = c(min_lib_size_atac,max_lib_size_atac), linetype = "dashed", color = "red")
lib_size_hist <- lib_size_hist_log +
scale_x_continuous(limits = c(0, 5000)) +
xlab("Library Size") +
theme_pubr()
lib_size_hist_log
lib_size_hist
Lower and the upper thresholds that we have set are shown as red dashed lines.
Here we can see the distribution of the library size of RNAm each barcoded cell from each sample.
ls<- ggviolin(metadata_df,
x = "id", fill = "steelblue", x.text.angle = 25,
y = "nCount_RNA", title = "Library size of RNA-seq (log10)" ,add="boxplot", add.params = list(fill = "white")) +
scale_y_log10() +
geom_hline(yintercept = c(min_lib_size_rna,max_lib_size_rna), linetype='dashed', col = 'red')+
labs(x = "Library name", y = "Number of counts RNA-seq")
ls
lib_size_hist_log <- metadata_df %>%
ggplot(aes_string("nCount_RNA")) +
geom_histogram(bins = 100) +
labs(x = "Library Size (total UMI)(log10)", y = "Number of Cells")+
theme_pubr()+
scale_x_log10() +
geom_vline(xintercept = c(min_lib_size_rna,max_lib_size_rna), linetype = "dashed", color = "red")
lib_size_hist <- lib_size_hist_log +
scale_x_continuous(limits = c(0, 3000)) +
xlab("Library Size (total UMI)") +
theme_pubr()
lib_size_hist_log + lib_size_hist
Number of detected genes are also the library complexity.
High number of detected genes may be an indication of duplicate/multiple cells. But can also be a larger celltype.
We expect to fin a large library complexity in a large library size. If we have a low library complexity ( low detected genes) in a large library size, it means that there a lot of duplicate and copie fragments.
We have set a lower threshold in order to remove cells with too few genes.
metadata_df$has_high_lib_size <-
metadata_df$nCount_RNA > min_lib_size_rna &
metadata_df$nFeature_RNA > min_ngenes_rna
We use log10 scale to have a better visualization of the distribution data.
ls<- ggviolin(metadata_df,
x = "id",
fill = "steelblue",
x.text.angle = 25,
y = "nFeature_RNA",
title = "Number of detected genes (log10)",
add="boxplot",
add.params = list(fill = "white")) +
geom_hline(yintercept = c(min_ngenes_rna,max_ngenes_rna), linetype='dashed', col = 'red')+
scale_y_log10()
ls
We want to delete the fist peak which represent technical issues and we are not interesting on keeping that because it will bias our results.
ngenes_hist <- metadata_df %>%
ggplot(aes_string("nFeature_RNA")) +
geom_histogram(bins = 100) +
labs(x = "Number of Detected Genes", y = "Number of Cells") +
theme_pubr()+
geom_vline(xintercept = min_ngenes_rna, linetype = "dashed", color = "red") +
geom_vline(xintercept = max_ngenes_rna, linetype = "dashed", color = "red")
ngenes_hist
The median of total number of genes is: 1764
The median number of genes per library:
aggregate(nFeature_RNA ~ id, data = metadata_df, median)
id nFeature_RNA
1 BCLL_14_T_1 1736
2 BCLL_14_T_2 1659
3 BCLL_15_T_1 1713
4 BCLL_15_T_2 1895
5 BCLL_2_T_1 1291
6 BCLL_2_T_2 1584
7 BCLL_2_T_3 1586
8 BCLL_8_T_1 2318
9 BCLL_8_T_2 2117
10 BCLL_9_T_1 1997
11 BCLL_9_T_2 2142
We can see that BCLL2 samples on average have lower number of detected genes than majority of samples.
tt<-lapply(all_data, tecnical.cell)
df.ti<-lapply(tt, make.metadata.df)
metadata_df.ttt <- map_df(df.ti, ~as.data.frame(.x), .id="id")
metadata_df.ttt<-metadata_df.ttt[order(metadata_df.ttt$id),]
ngenes_hist <- metadata_df.ttt %>%
ggplot(aes_string("nFeature_RNA")) +
geom_histogram(bins = 100) +
labs(x = "Number of Detected Genes", y = "Number of Cells") +
theme_pubr()+
geom_vline(xintercept = min_ngenes_rna, linetype = "dashed", color = "red")
ngenes_hist1<- ngenes_hist + geom_vline(xintercept = max_ngenes_rna, linetype = "dashed", color = "red")
ngenes_hist1 + ngenes_hist
Here we can see that there is a corelation between the library size and the library complexity in each sample. Tha means that we have a high variability of genes. It can suggest that more average number of read more genes we find.
corr_lib_size_lib_comp<-ggscatter(metadata_df,y="nFeature_RNA", x="nCount_RNA",
color="blue",
add="reg.line",
conf.int = T,
cor.coef = T,
ylim=c(0,20000),
title = "Correlation between lib. size and lib. complexity",
cor.coeff.args = list(method = "pearson", label.x = 10000, label.sep = "\n"))
corr_lib_size_lib_comp1<-ggscatter(metadata_df,y="nFeature_RNA", x="nCount_RNA",
color="id",
add="reg.line",
conf.int = T,
cor.coef = T,
ylim=c(0,20000),
title = "Correlation between lib. size and lib. complexity",
cor.coeff.args = list(method = "pearson", label.x = 10000, label.sep = "\n"))
corr_lib_size_lib_comp + labs(y = "Number of Detected Genes", x = "Library size")
corr_lib_size_lib_comp1 + labs(y = "Number of Detected Genes", x = "Library size")
Suggested that when the cell membrane is broken, cytoplasmic RNA will be lost, but not RNAs enclosed in the mitochondria. High content of mitochondrial RNA may indicate apoptosis.
pct_mit_hist <- metadata_df %>%
ggplot(aes_string("percent.mt")) +
geom_histogram(bins = 100) +
labs(x = "% Mitochondrial Expression", y = "Number of Cells") +
theme_pubr()+
geom_vline(xintercept = max_percent_mit, linetype = "dashed", color = "red") +
scale_x_continuous(limits = c(0, 100))
pct_mit_hist
corr_lib_size_lib_comp.ttt<-ggscatter(metadata_df.ttt,y="percent.mt", x="nFeature_RNA",
color="blue",
add="reg.line",
conf.int = T,
cor.coef = T,
ylim=c(0,100),
title = "Correlation between number of detected genes vs % of RNA mitochondrial",
cor.coeff.args = list(method = "pearson", label.x = 30,label.y = 90, label.sep = "\n"))
corr_lib_size_lib_comp.ttt + labs(y = "% of RNA mitocondrial", x = "Features")
Possible that degradation of RNA leads to more templating of rRNA-fragments. Proportion ribosomal proteins may be an artifact from handling of samples for cells of the same celltype.
pct_ribo_hist <- metadata_df %>%
ggplot(aes_string("percent_ribo")) +
geom_histogram(bins = 100) +
labs(x = "% Ribosomal Expression", y = "Number of Cells") +
theme_pubr()+
scale_x_continuous(limits = c(0, 100))
pct_ribo_hist
PCA (shall we do it?)
Examine PCA/tSNE before/after filtering and make a judgment on whether to remove more/less cells. (Scater tutorial)
Check for batch effects in PCA
After calculating the quality control metrics for both ATAC and RNA assay
we are going to remove cells that are outliers for these metrics which will be save in a
seurt oBject list called filtered.cell
filtered.cell<- lapply(all_data, filtering.cell)
#data frame of the filtered cells samples
filtered.cell.df<-lapply(filtered.cell, make.metadata.df)
filtered.cell.df <- map_df(filtered.cell.df, ~as.data.frame(.x), .id="id")
filtered.cell.df<-filtered.cell.df[order(filtered.cell.df$id),]
Data frame of the number of cell with the metrics.
num_fil_cell<- as.data.frame(ddply(filtered.cell.df, .(id), nrow))
#change column names
colnames(num_fil_cell) <- c("library_name","filt_cells")
We are going to collect all that cells that are outliers of the QC metrics and will be save in a list of seurat object called `filtered.out.cells.
#Colleting cells are out of QC thresholds
filtered.out.cells<-lapply(all_data, filter.out.cell)
Create a data frame with the metadata of the filtered out cells.
#data frame of the filtered out cells metadata.
filtered.out.cell.df<-lapply(filtered.out.cells, make.metadata.df)
filtered.out.cell.df <- map_df(filtered.out.cell.df, ~as.data.frame(.x), .id="id")
filtered.out.cell.df<-filtered.out.cell.df[order(filtered.out.cell.df$id),]
#Data frame of number of cells that were out of he QC metrics
num_filOut_cell<- as.data.frame(ddply(filtered.out.cell.df, .(id), nrow))
colnames(num_filOut_cell) <- c("library_name","filt_out_cells")
Merge the initial number of cell, the filtered cell and the filtered out cells data frames
ini_filt_df<-merge(num_cell, num_fil_cell,by = "library_name")
ini_filt_df<-merge(ini_filt_df,num_filOut_cell, by = "library_name")
|
Total Number
|
|||
|---|---|---|---|
| library_name | initial_cells | filt_cells | filt_out_cells |
| BCLL_14_T_1 | 6919 | 6622 | 297 |
| BCLL_14_T_2 | 6103 | 5839 | 264 |
| BCLL_15_T_1 | 5788 | 5511 | 277 |
| BCLL_15_T_2 | 5845 | 5532 | 313 |
| BCLL_2_T_1 | 10837 | 9255 | 1582 |
| BCLL_2_T_2 | 7910 | 6732 | 1178 |
| BCLL_2_T_3 | 7460 | 6511 | 949 |
| BCLL_8_T_1 | 7181 | 6883 | 298 |
| BCLL_8_T_2 | 7861 | 7506 | 355 |
| BCLL_9_T_1 | 5572 | 5071 | 501 |
| BCLL_9_T_2 | 5530 | 5094 | 436 |
Difference between initial number of cell and filtered number of cell
#create a column with the number of deleted cells which should be the same than the filtered out cell.
ini_filt_df$del_cells <- (ini_filt_df$initial_cells - ini_filt_df$filt_cells)
Percentages of the deleted and QC pass-filter cells
ini_filt_df$pct_keep_cells <- round(((ini_filt_df$filt_cells/ini_filt_df$initial_cells)*100),2)
ini_filt_df$pct_del_cells <- round((100-ini_filt_df$pct_keep_cells),2)
Meaning of each column:
pct_del_cells= percentage of deleted cells
pct_keep_cells= percentage of keep cells
initial_cells= total number of initial cells
filt_cells= total number of filtered cells
filt_out_cells= total number of filtered out cells (it should match with del_cell column)
del_cell= total number of deleted cells
|
Total Number
|
Percentage %
|
|||||
|---|---|---|---|---|---|---|
| library_name | initial_cells | filt_cells | filt_out_cells | del_cells | pct_keep_cells | pct_del_cells |
| BCLL_14_T_1 | 6919 | 6622 | 297 | 297 | 95.71 | 4.29 |
| BCLL_14_T_2 | 6103 | 5839 | 264 | 264 | 95.67 | 4.33 |
| BCLL_15_T_1 | 5788 | 5511 | 277 | 277 | 95.21 | 4.79 |
| BCLL_15_T_2 | 5845 | 5532 | 313 | 313 | 94.64 | 5.36 |
| BCLL_2_T_1 | 10837 | 9255 | 1582 | 1582 | 85.40 | 14.60 |
| BCLL_2_T_2 | 7910 | 6732 | 1178 | 1178 | 85.11 | 14.89 |
| BCLL_2_T_3 | 7460 | 6511 | 949 | 949 | 87.28 | 12.72 |
| BCLL_8_T_1 | 7181 | 6883 | 298 | 298 | 95.85 | 4.15 |
| BCLL_8_T_2 | 7861 | 7506 | 355 | 355 | 95.48 | 4.52 |
| BCLL_9_T_1 | 5572 | 5071 | 501 | 501 | 91.01 | 8.99 |
| BCLL_9_T_2 | 5530 | 5094 | 436 | 436 | 92.12 | 7.88 |
Mean of percentage of cell that pass the QC metrics is: 92.13 %
We are deleting on average 7.87 % of bad quality cells.
pct_mit_hist <- filtered.out.cell.df %>%
ggplot(aes_string("percent.mt")) +
geom_histogram(bins = 100) +
labs(x = "% Mitochondrial Expression", y = "Number of Cells") +
theme_pubr()+
geom_vline(xintercept = max_percent_mit, linetype = "dashed", color = "red") +
scale_x_continuous(limits = c(0, 100))
pct_mit_hist
saveRDS(all_data,"../results/R_objects/1.tonsil_multiome_QC.rds")
saveRDS(filtered.cell,"../results/R_objects/2.tonsil_multiome_filtered.rds")
saveRDS(filtered.out.cells,"../results/R_objects/3.tonsil_multiome_filtered_Out.rds")
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
locale:
[1] es_ES.UTF-8/es_ES.UTF-8/es_ES.UTF-8/C/es_ES.UTF-8/es_ES.UTF-8
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] scater_1.22.0 scuttle_1.4.0
[3] SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0
[5] MatrixGenerics_1.6.0 matrixStats_0.61.0
[7] magick_2.7.3 RColorBrewer_1.1-2
[9] hdf5r_1.3.5 EnsDb.Hsapiens.v86_2.99.0
[11] ensembldb_2.18.2 AnnotationFilter_1.18.0
[13] GenomicFeatures_1.46.1 AnnotationDbi_1.56.2
[15] Biobase_2.54.0 GenomicRanges_1.46.1
[17] GenomeInfoDb_1.30.0 IRanges_2.28.0
[19] S4Vectors_0.32.3 BiocGenerics_0.40.0
[21] data.table_1.14.2 reshape2_1.4.4
[23] plyr_1.8.6 forcats_0.5.1
[25] stringr_1.4.0 dplyr_1.0.7
[27] purrr_0.3.4 readr_2.1.1
[29] tidyr_1.1.4 tibble_3.1.6
[31] tidyverse_1.3.1 ggpubr_0.4.0
[33] ggplot2_3.3.5 SeuratObject_4.0.4
[35] Seurat_4.0.6 Signac_1.5.0
[37] usethis_2.1.5 kableExtra_1.3.4
[39] knitr_1.36 BiocStyle_2.22.0
loaded via a namespace (and not attached):
[1] rappdirs_0.3.3 SnowballC_0.7.0
[3] rtracklayer_1.54.0 scattermore_0.7
[5] bit64_4.0.5 irlba_2.3.5
[7] DelayedArray_0.20.0 rpart_4.1-15
[9] KEGGREST_1.34.0 RCurl_1.98-1.5
[11] generics_0.1.1 ScaledMatrix_1.2.0
[13] cowplot_1.1.1 RSQLite_2.2.9
[15] RANN_2.6.1 future_1.23.0
[17] bit_4.0.4 tzdb_0.2.0
[19] spatstat.data_2.1-2 webshot_0.5.2
[21] xml2_1.3.3 lubridate_1.8.0
[23] httpuv_1.6.4 assertthat_0.2.1
[25] viridis_0.6.2 xfun_0.29
[27] hms_1.1.1 jquerylib_0.1.4
[29] evaluate_0.14 promises_1.2.0.1
[31] fansi_0.5.0 restfulr_0.0.13
[33] progress_1.2.2 dbplyr_2.1.1
[35] readxl_1.3.1 igraph_1.2.10
[37] DBI_1.1.1 htmlwidgets_1.5.4
[39] sparsesvd_0.2 spatstat.geom_2.3-1
[41] ellipsis_0.3.2 backports_1.4.1
[43] bookdown_0.24 sparseMatrixStats_1.6.0
[45] biomaRt_2.50.1 deldir_1.0-6
[47] vctrs_0.3.8 ROCR_1.0-11
[49] abind_1.4-5 cachem_1.0.6
[51] withr_2.4.3 ggforce_0.3.3
[53] BSgenome_1.62.0 checkmate_2.0.0
[55] sctransform_0.3.2 GenomicAlignments_1.30.0
[57] prettyunits_1.1.1 goftest_1.2-3
[59] svglite_2.0.0 cluster_2.1.2
[61] lazyeval_0.2.2 crayon_1.4.2
[63] labeling_0.4.2 pkgconfig_2.0.3
[65] slam_0.1-49 tweenr_1.0.2
[67] vipor_0.4.5 nlme_3.1-153
[69] ProtGenerics_1.26.0 nnet_7.3-16
[71] rlang_0.4.12 globals_0.14.0
[73] lifecycle_1.0.1 miniUI_0.1.1.1
[75] filelock_1.0.2 BiocFileCache_2.2.0
[77] rsvd_1.0.5 modelr_0.1.8
[79] dichromat_2.0-0 cellranger_1.1.0
[81] polyclip_1.10-0 lmtest_0.9-39
[83] Matrix_1.3-4 ggseqlogo_0.1
[85] carData_3.0-4 zoo_1.8-9
[87] base64enc_0.1-3 beeswarm_0.4.0
[89] reprex_2.0.1 ggridges_0.5.3
[91] png_0.1-7 viridisLite_0.4.0
[93] rjson_0.2.20 bitops_1.0-7
[95] KernSmooth_2.23-20 Biostrings_2.62.0
[97] blob_1.2.2 DelayedMatrixStats_1.16.0
[99] parallelly_1.30.0 jpeg_0.1-9
[101] rstatix_0.7.0 ggsignif_0.6.3
[103] beachmat_2.10.0 scales_1.1.1
[105] memoise_2.0.1 magrittr_2.0.1
[107] ica_1.0-2 zlibbioc_1.40.0
[109] compiler_4.1.2 BiocIO_1.4.0
[111] fitdistrplus_1.1-6 Rsamtools_2.10.0
[113] cli_3.1.0 XVector_0.34.0
[115] listenv_0.8.0 patchwork_1.1.1
[117] pbapply_1.5-0 htmlTable_2.3.0
[119] Formula_1.2-4 MASS_7.3-54
[121] mgcv_1.8-38 tidyselect_1.1.1
[123] stringi_1.7.6 highr_0.9
[125] yaml_2.2.1 BiocSingular_1.10.0
[127] latticeExtra_0.6-29 ggrepel_0.9.1
[129] grid_4.1.2 VariantAnnotation_1.40.0
[131] sass_0.4.0 fastmatch_1.1-3
[133] tools_4.1.2 future.apply_1.8.1
[135] parallel_4.1.2 rstudioapi_0.13
[137] foreign_0.8-81 lsa_0.73.2
[139] gridExtra_2.3 farver_2.1.0
[141] Rtsne_0.15 digest_0.6.29
[143] BiocManager_1.30.16 shiny_1.7.1
[145] qlcMatrix_0.9.7 Rcpp_1.0.7
[147] car_3.0-12 broom_0.7.10
[149] later_1.3.0 RcppAnnoy_0.0.19
[151] httr_1.4.2 biovizBase_1.42.0
[153] colorspace_2.0-2 rvest_1.0.2
[155] XML_3.99-0.8 fs_1.5.2
[157] tensor_1.5 reticulate_1.22
[159] splines_4.1.2 uwot_0.1.11
[161] RcppRoll_0.3.0 spatstat.utils_2.3-0
[163] plotly_4.10.0 systemfonts_1.0.3
[165] xtable_1.8-4 jsonlite_1.7.2
[167] R6_2.5.1 Hmisc_4.6-0
[169] pillar_1.6.4 htmltools_0.5.2
[171] mime_0.12 glue_1.6.0
[173] fastmap_1.1.0 BiocParallel_1.28.3
[175] BiocNeighbors_1.12.0 codetools_0.2-18
[177] utf8_1.2.2 lattice_0.20-45
[179] bslib_0.3.1 spatstat.sparse_2.1-0
[181] ggbeeswarm_0.6.0 curl_4.3.2
[183] leiden_0.3.9 survival_3.2-13
[185] rmarkdown_2.11 docopt_0.7.1
[187] munsell_0.5.0 GenomeInfoDbData_1.2.7
[189] haven_2.4.3 gtable_0.3.0
[191] spatstat.core_2.3-2